Data Input and Preprocessing

Intro

We are using the “Now That’s What I Call Music(U.S. releases)” data set from Kaggle to predict the danceability of a song. Danceability is described with how much a song will get people todance from the tempo, rhythm stability, beat, and regulatiry. The scale of danceability ranges from 0 to 1, 0 being the least danceable while 1 is the most danceable song.

Preprocessing

Looking at the data there does not seem to be any null data within the database. From looking at the database it deo

library(rpart)
library(rpart.plot)
library(maptree)
## Loading required package: cluster
library(e1071)
library(cluster)

# the following utility files can be found attached to the assignment
source("https://raw.githubusercontent.com/grbruns/cst383/master/lin-regr-util.R")
source("https://raw.githubusercontent.com/grbruns/cst383/master/class-util.R")

dat=read.csv("data.csv")

sum(is.null(dat))
## [1] 0
dat$X = NULL
dat$loudness=abs(dat$loudness)
dat$song_title = NULL
dat$instrumentalness = NULL

dat$danceability=dat$danceability*100
dat$energy=dat$energy*100
dat$acousticness=dat$acousticness*100
dat$liveness=dat$liveness*100
dat$valence=dat$valence*100
dat$speechiness=dat$speechiness*100

Data Exploration

dat$song_title=NULL
summary(dat)
##   acousticness       danceability    duration_ms          energy     
##  Min.   : 0.00028   Min.   :12.20   Min.   :  16042   Min.   : 1.48  
##  1st Qu.: 0.96300   1st Qu.:51.40   1st Qu.: 200015   1st Qu.:56.30  
##  Median : 6.33000   Median :63.10   Median : 229261   Median :71.50  
##  Mean   :18.75900   Mean   :61.84   Mean   : 246306   Mean   :68.16  
##  3rd Qu.:26.50000   3rd Qu.:73.80   3rd Qu.: 270333   3rd Qu.:84.60  
##  Max.   :99.50000   Max.   :98.40   Max.   :1004627   Max.   :99.80  
##                                                                      
##       key            liveness        loudness           mode       
##  Min.   : 0.000   Min.   : 1.88   Min.   : 0.307   Min.   :0.0000  
##  1st Qu.: 2.000   1st Qu.: 9.23   1st Qu.: 4.746   1st Qu.:0.0000  
##  Median : 6.000   Median :12.70   Median : 6.248   Median :1.0000  
##  Mean   : 5.343   Mean   :19.08   Mean   : 7.086   Mean   :0.6123  
##  3rd Qu.: 9.000   3rd Qu.:24.70   3rd Qu.: 8.394   3rd Qu.:1.0000  
##  Max.   :11.000   Max.   :96.90   Max.   :33.097   Max.   :1.0000  
##                                                                    
##   speechiness         tempo        time_signature     valence     
##  Min.   : 2.310   Min.   : 47.86   Min.   :1.000   Min.   : 3.48  
##  1st Qu.: 3.750   1st Qu.:100.19   1st Qu.:4.000   1st Qu.:29.50  
##  Median : 5.490   Median :121.43   Median :4.000   Median :49.20  
##  Mean   : 9.266   Mean   :121.60   Mean   :3.968   Mean   :49.68  
##  3rd Qu.:10.800   3rd Qu.:137.85   3rd Qu.:4.000   3rd Qu.:69.10  
##  Max.   :81.600   Max.   :219.33   Max.   :5.000   Max.   :99.20  
##                                                                   
##      target                   artist    
##  Min.   :0.0000   Drake          :  16  
##  1st Qu.:0.0000   Rick Ross      :  13  
##  Median :1.0000   Disclosure     :  12  
##  Mean   :0.5057   Backstreet Boys:  10  
##  3rd Qu.:1.0000   WALK THE MOON  :  10  
##  Max.   :1.0000   Crystal Castles:   9  
##                   (Other)        :1947
fit=lm(danceability ~ acousticness+energy+liveness+loudness+speechiness+valence, data=dat)

summary(fit)
## 
## Call:
## lm(formula = danceability ~ acousticness + energy + liveness + 
##     loudness + speechiness + valence, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.345  -8.519   0.648   9.920  42.729 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  72.33759    2.48554  29.103  < 2e-16 ***
## acousticness -0.16090    0.01555 -10.349  < 2e-16 ***
## energy       -0.27008    0.02534 -10.659  < 2e-16 ***
## liveness     -0.09729    0.02030  -4.792 1.77e-06 ***
## loudness     -0.58480    0.12637  -4.628 3.93e-06 ***
## speechiness   0.21665    0.03420   6.335 2.92e-10 ***
## valence       0.30039    0.01295  23.202  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.68 on 2010 degrees of freedom
## Multiple R-squared:  0.2809, Adjusted R-squared:  0.2788 
## F-statistic: 130.9 on 6 and 2010 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(density(dat$danceability),col="red",main="Density of Features", ylim = c(0,.07))
lines(density(dat$energy),col="green")
lines(density(dat$liveness),col="blue")

legend("topright",legend=c("Dance","Energy","Liveness"),col=c("Red","Green","Blue"),lty=1,cex=.7)

plot(ecdf(dat$danceability),col="red",main="ECDF of Features")
lines(ecdf(dat$energy),col="green")
lines(ecdf(dat$liveness),col="blue")
legend("bottomright",legend=c("Dance","Energy","Liveness"),col=c("Red","Green","Blue"),lty=1,cex=.7)

plot(danceability ~ energy, data=dat, col="red",main="Danceability by Energy")
plot(danceability ~ liveness, data=dat, col="red",main="Danceability by Liveness")

set.seed(123)
split = split_data(dat)
tr_dat = split[[1]]
te_dat = split[[2]]

fit1=lm(danceability ~ energy+liveness, data=tr_dat)
predicts=predict(fit1, te_dat)
actual=te_dat$danceability
rng=range(c(predicts,actual))

plot(actual ~ predicts, pch=20, main="Predicted Danceablity Vs. Actual",col="red")
lines(c(rng[1],rng[2]),c(rng[1],rng[2]),lty=2,col="blue",lwd=1.5)

par(mfrow=c(2,2))
plot(fit1)

summary(fit1)
## 
## Call:
## lm(formula = danceability ~ energy + liveness, data = tr_dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.525 -10.447   0.845  12.048  36.668 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.40806    1.40251  43.784  < 2e-16 ***
## energy       0.04211    0.01987   2.120   0.0342 *  
## liveness    -0.13114    0.02637  -4.974 7.33e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.81 on 1509 degrees of freedom
## Multiple R-squared:  0.01689,    Adjusted R-squared:  0.01559 
## F-statistic: 12.96 on 2 and 1509 DF,  p-value: 2.614e-06
mse=mean((actual-predicts)^2)
rmse=sqrt(mse)
cat("\nRoot Mean Square: ",rmse)
## 
## Root Mean Square:  16.30534
te_dat$artist=NULL
te_dat$key=NULL
te_dat$mode=NULL
te_dat$target=NULL
te_dat$time_signature=NULL
te_dat$duration_ms=NULL
te_dat$tempo=NULL

par(mfrow=c(1,2))
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,10)
cols=rainbow(10)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=10")
plot(danceability ~ liveness,data=te_dat,col=cols,pch=16,main="Danceability by Liveness, n=10")

fit2=kmeans(te_dat,10)
cat("Betweenss of fit: ",fit2$betweenss)
## Betweenss of fit:  875822.6
cat("Totss of fit: ",fit2$totss)
## Totss of fit:  1193276
cat("Betweenss/Totss", (fit2$betweenss/fit2$totss))
## Betweenss/Totss 0.7339648
sil=silhouette(fit2$cluster,dist(te_dat))
cat("Silhouette Width", mean(sil[,3]))
## Silhouette Width 0.2102025
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,100)
cols=rainbow(100)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=100")
plot(danceability ~ liveness,data=te_dat,col=cols,pch=16,main="Danceability by Liveness, n=100")

fit3=kmeans(te_dat,100)

cat("Betweenss of fit: ",fit3$betweenss)
## Betweenss of fit:  1113266
cat("Totss of fit: ",fit3$totss)
## Totss of fit:  1193276
cat("Betweenss/Totss", (fit3$betweenss/fit3$totss))
## Betweenss/Totss 0.9329497
sil2=silhouette(fit3$cluster,dist(te_dat))
cat("Silhouette Width",mean(sil2[,3]))
## Silhouette Width 0.209537
fit = rpart(danceability ~ energy +liveness, data = tr_dat)
prp(fit, extra=1, varlen=-10, main="Regression Tree for Danceability (tr_dat)",box.col= "green")

predicted = predict(fit, te_dat)
errors = te_dat$danceability - predicted
rmse = sqrt(mean(errors^2))

plot_predict_actual(predicted, actual, 2, "Regression Tree Danceablility Prediction")

hist(errors, col = "red4", main = "Histogram of Errors, Regression Tree")

par(mfrow=c(2,2))
plot(density(dat$danceability),col="red",main="Density of Features")
lines(density(dat$energy),col="green")
lines(density(dat$valence),col="blue")

legend("topright",legend=c("Dance","Energy","Valence"),col=c("Red","Green","Blue"),lty=1,cex=.7)

plot(ecdf(dat$danceability),col="red",main="ECDF of Features")
lines(ecdf(dat$energy),col="green")
lines(ecdf(dat$valence),col="blue")
legend("bottomright",legend=c("Dance","Energy","Valence"),col=c("Red","Green","Blue"),lty=1,cex=.7)

plot(danceability ~ energy, data=dat, col="red",main="Danceability by Energy")
plot(danceability ~ valence, data=dat, col="red",main="Danceability by Valence")

fit4 = lm(danceability ~ energy+valence, data=tr_dat)
predicts1 = predict(fit4, newdata=te_dat, type="response")
actual1=te_dat$danceability
rang = range(c(predicts1, actual1))
par(mfrow=c(1,1))
plot(predicts1 ~ actual1, main="Predicting Danceability using Energy and Valence", ylab = "Prediction", xlab="Actuals", col="red", pch=20)
lines(c(rang[1], rang[2]),
      c(rang[1], rang[2]), col="blue",lty=2)

par(mfrow=c(2,2))
plot(fit4)

mse1=mean((actual1-predicts1)^2)
rmse1=sqrt(mse1)
cat("Root Mean Square: ",rmse1)
## Root Mean Square:  14.6752

From the graphs exploring the tree of the data of Valence and Energy to danceability using testing data and training and seeing the comparison of them.

par(mfrow=c(1,2))
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,10)
cols=rainbow(10)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=10")
plot(danceability ~ valence,data=te_dat,col=cols,pch=16,main="Danceability by Valence, n=10")

fit2=kmeans(te_dat,10)
cat("Betweenss of fit: ",fit2$betweenss)
## Betweenss of fit:  876718.2
cat("Totss of fit: ",fit2$totss)
## Totss of fit:  1193276
cat("Betweenss/Totss", (fit2$betweenss/fit2$totss))
## Betweenss/Totss 0.7347154
sil=silhouette(fit2$cluster,dist(te_dat))
cat("Silhouette Width", mean(sil[,3]))
## Silhouette Width 0.2158532
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,100)
cols=rainbow(100)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=100")
plot(danceability ~ valence,data=te_dat,col=cols,pch=16,main="Danceability by Valence, n=100")

fit3=kmeans(te_dat,100)

cat("Betweenss of fit: ",fit3$betweenss)
## Betweenss of fit:  1112912
cat("Totss of fit: ",fit3$totss)
## Totss of fit:  1193276
cat("Betweenss/Totss", (fit3$betweenss/fit3$totss))
## Betweenss/Totss 0.9326528
sil2=silhouette(fit3$cluster,dist(te_dat))
cat("Silhouette Width",mean(sil2[,3]))
## Silhouette Width 0.2038265
fit5 = rpart(danceability ~ energy +valence, data = tr_dat)
prp(fit5, extra=1, varlen=-10, main="Regression Tree for Danceability (tr_dat)",box.col= "green")

fit6 = rpart(danceability ~ energy +valence, data = te_dat)
prp(fit6, extra=1, varlen=-10, main="Regression Tree for Danceability (te_dat)",box.col= "green")

predicted = predict(fit5, te_dat)
errors = te_dat$danceability - predicted
rmse = sqrt(mean(errors^2))

plot_predict_actual(predicted, actual, 2, "Regression Tree Danceablility Prediction")

hist(errors, col = "red4", main = "Histogram of Errors, Regression Tree")

dat$dance=ifelse(dat$danceability>70,1,0)
fit=glm(dance ~ energy + valence, data=dat,family=binomial)
summary(fit)
## 
## Call:
## glm(formula = dance ~ energy + valence, family = binomial, data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6541  -0.8792  -0.6233   1.0968   2.3357  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.393615   0.181184  -7.692 1.45e-14 ***
## energy      -0.015742   0.002607  -6.039 1.55e-09 ***
## valence      0.034094   0.002316  14.724  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2579.5  on 2016  degrees of freedom
## Residual deviance: 2324.9  on 2014  degrees of freedom
## AIC: 2330.9
## 
## Number of Fisher Scoring iterations: 4
#dat$dance=as.factor(dat$dance)

split = split_data(dat)
tr_dat = split[[1]]
te_dat = split[[2]]

fit=glm(dance ~ energy + valence,data=tr_dat,family=binomial)

y=predict(fit,newdata=te_dat,type="response")
predicts=as.numeric(y>.7)
actuals=te_dat$dance
table(predicts,actuals)
##         actuals
## predicts   0   1
##        0 314 181
##        1   0  10
cat("\nAccuracy of predictions: ",mean(predicts==actuals))
## 
## Accuracy of predictions:  0.6415842
fit1=glm(dance ~ energy, data=dat, family=binomial)
fit2=glm(dance ~ valence, data=dat, family=binomial)
temp1=data.frame(energy=seq(min(dat$energy),max(dat$energy),10))
temp2=data.frame(valence=seq(min(dat$valence),max(dat$valence),10))


probs1=predict(fit1,newdata=temp1,type="response")
probs2=predict(fit2,newdata=temp2,type="response")

plot(dance ~ energy,data=tr_dat,col="red",pch=20,ylab="P(danceability)")
lines(temp1$energy,probs1,col="blue",lwd=2)

plot(dance ~ valence,data=tr_dat,col="red",pch=20,ylab="P(danceability)")
lines(temp2$valence,probs2,col="blue",lwd=2)

fit.1=glm(dance ~ energy + valence, data=tr_dat,family=binomial)
predicts=predict(fit.1,te_dat,type="response")
actuals=te_dat$dance

plot(density(predicts[actuals==1]),col="red",xlab="Logistic Regression output",main="Double Density of Energy and Valence",ylim=c(0,3.0))
legend("topright",col=c("red","blue"),legend=c("Danceable","Not Danceable"),lty=1)
lines(density(predicts[actuals==0]),col="blue",lwd=2)